{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1484609c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "# Fetch only the parquet files under `without-audio/` of the dataset repo\n",
    "# into the current working directory.\n",
    "download_options = dict(\n",
    "    repo_id='mesolitica/Malaysian-Speech-Instructions',\n",
    "    repo_type='dataset',\n",
    "    allow_patterns='without-audio/*.parquet',\n",
    "    local_dir='./',\n",
    ")\n",
    "snapshot_download(**download_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "f64a03f8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mallm.parquet  speech-instructions.parquet  tatabahasa.parquet\r\n"
     ]
    }
   ],
   "source": [
    "# List the files present in the `without-audio` directory.\n",
    "!ls without-audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ae39f86c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4cd8834e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "350833"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shards that this notebook itself produces and uploads into `without-audio/`.\n",
    "# They match the glob below once the snapshot is downloaded again, so they are\n",
    "# skipped to avoid feeding the processed output back in as input.\n",
    "derived_shards = {'voice_assistant.parquet'}\n",
    "\n",
    "# Flatten every source shard into one list of record dicts.\n",
    "data = []\n",
    "# sorted() makes the record order independent of the filesystem listing order.\n",
    "for f in sorted(glob('without-audio/*.parquet')):\n",
    "    if os.path.basename(f) in derived_shards:\n",
    "        continue\n",
    "    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))\n",
    "    \n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "085fe789",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>answer</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Malaysia's public transport system is fragment...</td>\n",
       "      <td>Sistem pengangkutan awam Malaysia telah mengha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Malaysia is vulnerable to climate change due t...</td>\n",
       "      <td>Malaysia is indeed one of the countries most v...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>PDRM berusaha keras untuk menangani aktiviti A...</td>\n",
       "      <td>Perkembangan terkini dalam menangani isu aktiv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Usaha membantu masyarakat Bajau di Sabah terma...</td>\n",
       "      <td>Untuk membantu masyarakat Bajau di Sabah, terd...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Saya adalah AI yang direka untuk membantu dan ...</td>\n",
       "      <td>Itulah betul! Saya adalah program komputer, te...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              answer  \\\n",
       "0  Malaysia's public transport system is fragment...   \n",
       "1  Malaysia is vulnerable to climate change due t...   \n",
       "2  PDRM berusaha keras untuk menangani aktiviti A...   \n",
       "3  Usaha membantu masyarakat Bajau di Sabah terma...   \n",
       "4  Saya adalah AI yang direka untuk membantu dan ...   \n",
       "\n",
       "                                                text  \n",
       "0  Sistem pengangkutan awam Malaysia telah mengha...  \n",
       "1  Malaysia is indeed one of the countries most v...  \n",
       "2  Perkembangan terkini dalam menangani isu aktiv...  \n",
       "3  Untuk membantu masyarakat Bajau di Sabah, terd...  \n",
       "4  Itulah betul! Saya adalah program komputer, te...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the summarization table from the working directory and preview it.\n",
    "# The later cells use its `text` column as lookup key and `answer` as value.\n",
    "summary_path = 'summarization.parquet'\n",
    "summary = pd.read_parquet(summary_path)\n",
    "summary.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "86565112",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 1264550/1264550 [00:15<00:00, 81499.87it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build a lookup from each `text` value to its `answer` value.\n",
    "# The two columns are iterated together instead of doing a positional `.iloc`\n",
    "# lookup per row; when a `text` value repeats, the last row wins, as before.\n",
    "summaries = {}\n",
    "for text, answer in tqdm(zip(summary['text'], summary['answer']), total=len(summary)):\n",
    "    summaries[text] = answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "070a0ecf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pool of alternative system prompts; the processing loop below picks one at\n",
    "# random for every conversation. All of them ask for replies under 300 characters.\n",
    "system_prompt = [\n",
    "    'you are chatbot that use for voice assistant, your response always precise and shorter than 300 characters',\n",
    "    'You are a chatbot designed specifically for use as a voice assistant. Your responses must be clear, precise, and always shorter than 300 characters. Prioritize natural, conversational language suitable for speech. Avoid filler, long explanations, or complex phrasing. Do not include unnecessary details unless explicitly asked. If a user’s request is ambiguous, briefly ask for clarification within the character limit. Aim to be helpful, fast, and concise in all replies.',\n",
    "    'You are a helpful voice assistant chatbot. Always reply with short, clear answers under 300 characters. Speak naturally, like in conversation. Keep it direct—no fluff, no long explanations. If you’re unsure, ask a quick clarifying question.',\n",
    "    'Act as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don’t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.',\n",
    "    \"You're a chatbot for voice use. Talk like a helpful person, not a machine. Keep it short—under 300 characters. Focus on what the user needs. Don’t ramble. Be polite, fast, and smart. If the user’s unclear, just ask quickly and keep going.\",\n",
    "    'You are a voice-based assistant designed to be brief and precise. All replies must be under 300 characters. Speak in a friendly, natural tone suitable for spoken output. Avoid technical jargon and long replies. Ask short questions if more info is needed.',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "865a81e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'role': 'user', 'content': [{'type': 'audio', 'audio_url': 'audio.wav'}]},\n",
       " {'role': 'assistant', 'content': 'B. buaya tembaga'}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first raw record: its `prompt` field is a JSON string that\n",
    "# decodes to a list of chat messages.\n",
    "json.loads(data[0]['prompt'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6ebeaf1f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 350833/350833 [00:07<00:00, 45489.97it/s]\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Longest assistant reply (in characters) that is kept verbatim.\n",
    "MAX_RESPONSE_CHARS = 300\n",
    "\n",
    "# A dedicated, seeded generator keeps the system-prompt assignment reproducible\n",
    "# across kernel restarts without touching the global `random` state.\n",
    "rng = random.Random(0)\n",
    "\n",
    "processed = []\n",
    "for record in tqdm(data):\n",
    "    conversation = json.loads(record['prompt'])\n",
    "    messages = [{'role': 'system', 'content': rng.choice(system_prompt)}]\n",
    "    for turn in conversation:\n",
    "        if turn['role'] == 'assistant':\n",
    "            reply = turn['content']\n",
    "            if reply is not None and len(reply) > MAX_RESPONSE_CHARS:\n",
    "                # Over-long replies are replaced by their precomputed summary.\n",
    "                # A missing or null summary yields None and rejects the sample,\n",
    "                # instead of keeping a reply longer than the system prompt allows.\n",
    "                reply = summaries.get(reply)\n",
    "            if reply is None:\n",
    "                break\n",
    "            turn['content'] = reply\n",
    "        messages.append(turn)\n",
    "    else:\n",
    "        # Reached only when no turn was rejected. With the system message in\n",
    "        # front, complete user/assistant pairs give an odd number of messages;\n",
    "        # anything else is an incomplete conversation and is skipped.\n",
    "        if len(messages) % 2 == 1:\n",
    "            processed.append({\n",
    "                'prompt': json.dumps(messages),\n",
    "                'audio_filename': record['audio_filename'],\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2a6da885",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "143451"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of conversations that survived the filtering above.\n",
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "a35515c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prompt': '[{\"role\": \"system\", \"content\": \"You are a helpful voice assistant chatbot. Always reply with short, clear answers under 300 characters. Speak naturally, like in conversation. Keep it direct\\\\u2014no fluff, no long explanations. If you\\\\u2019re unsure, ask a quick clarifying question.\"}, {\"role\": \"user\", \"content\": \"Bagaimana mungkin Malaysia masih mempunyai undang-undang yang menjadikan wanita tidak sah keluar di khalayak ramai selepas tengah malam?\"}, {\"role\": \"assistant\", \"content\": \"Di Malaysia, undang-undang batasan kediaman menghalang wanita keluar lewat malam di tempat awam. Ini ketinggalan zaman dan diskriminatif, melanggar hak kebebasan pergerakan. Usaha sedang dijalankan untuk reformasi atau pembatalan undang-undang tersebut.\"}, {\"role\": \"user\", \"content\": [{\"type\": \"audio\", \"audio_url\": \"audio.wav\"}]}, {\"role\": \"assistant\", \"content\": \"Enakmen Kediaman Terhad di Malaysia tengah dikaji untuk dipinda demi mengurangkan diskriminasi. Usaha sedang dibuat untuk menjadikan enakmen itu lebih neutral jantina dan fleksibel. Majlis Peguam Malaysia juga menyeru pemansuhannya kerana dianggap diskriminasi.\"}, {\"role\": \"user\", \"content\": \"Sebagai pengguna, saya ingin tahu lebih lanjut mengenai spesifik pindaan yang dicadangkan kepada Enakmen Kediaman Terhad di Selangor. Bolehkah anda memberikan maklumat lanjut tentang apa yang dimaksudkan dengan pindaan ini dan bila ia dijangka akan dilaksanakan?\"}, {\"role\": \"assistant\", \"content\": \"Selangor proposed amendments to Restrictive Residency Enactment in 2019 to make it gender-neutral and more flexible. Changes include exceptions for certain activities, higher fines and penalties, clarified enforcement powers, and public education. Implementation timeline is unclear.\"}]',\n",
       " 'audio_filename': 'partition-instructions-part-7/17574.mp3'}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check one processed record (serialized messages plus audio filename).\n",
    "processed[10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "b2090b94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prompt': '[{\"role\": \"system\", \"content\": \"Act as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don\\\\u2019t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.\"}, {\"role\": \"user\", \"content\": [{\"type\": \"audio\", \"audio_url\": \"audio.wav\"}, {\"type\": \"text\", \"text\": \"\"}]}, {\"role\": \"assistant\", \"content\": \"PBB main peranan penting jaga keselamatan dunia. Dia gune agensi keselamatan, diplomasi, dan sokong negara krisis. Tujuan utama PBB cegah perang, promote kerjasama antara negara, dan pastikan ketenteraman dunia.\"}, {\"role\": \"user\", \"content\": \"Bagaimana pula PBB berusaha mempromosikan hak asasi manusia di seluruh dunia?\"}, {\"role\": \"assistant\", \"content\": \"PBB berperanan promosi hak asasi manusia dengan deklarasi, pemantauan pelanggaran, sidang berkala, latihan negara, dan kempen kesedaran. Tujuannya ialah untuk memastikan keadilan, kesaksamaan, dan penghormatan terhadap hak asasi manusia di seluruh dunia.\"}]',\n",
       " 'audio_filename': 'partition-instructions-part-3/4142.mp3'}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check a second processed record further into the list.\n",
    "processed[100000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d697b24f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the processed records into a frame and write it out as one parquet file.\n",
    "processed_frame = pd.DataFrame(processed)\n",
    "processed_frame.to_parquet('Malaysian-Speech-Instructions-summary.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "9e1b7662",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|██████████████████████████| 118M/118M [00:12<00:00, 9.21MB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Malaysian-Speech-Instructions/blob/main/without-audio/voice_assistant.parquet\n"
     ]
    }
   ],
   "source": [
    "# Upload the local parquet file to the dataset repo, storing it there as\n",
    "# `without-audio/voice_assistant.parquet`.\n",
    "!huggingface-cli upload mesolitica/Malaysian-Speech-Instructions \\\n",
    "Malaysian-Speech-Instructions-summary.parquet without-audio/voice_assistant.parquet \\\n",
    "--repo-type=dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "509285f3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
